<!DOCTYPE html>












  


<html class="theme-next gemini use-motion" lang="zh-CN">
<head>
  <meta charset="UTF-8"/>
<meta http-equiv="X-UA-Compatible" content="IE=edge" />
<meta name="viewport" content="width=device-width, initial-scale=1, maximum-scale=2"/>
<meta name="theme-color" content="#222">












<meta http-equiv="Cache-Control" content="no-transform" />
<meta http-equiv="Cache-Control" content="no-siteapp" />






















<link href="/lib/font-awesome/css/font-awesome.min.css?v=4.6.2" rel="stylesheet" type="text/css" />

<link href="/css/main.css?v=6.4.0" rel="stylesheet" type="text/css" />


  <link rel="apple-touch-icon" sizes="180x180" href="/images/apple-touch-icon-next.png?v=6.4.0">


  <link rel="icon" type="image/png" sizes="32x32" href="/images/favicon-32x32-next.png?v=6.4.0">


  <link rel="icon" type="image/png" sizes="16x16" href="/images/favicon-16x16-next.png?v=6.4.0">


  <link rel="mask-icon" href="/images/logo.svg?v=6.4.0" color="#222">









<script type="text/javascript" id="hexo.configurations">
  // Reuse window.NexT when it already holds a truthy value; otherwise start
  // from an empty object.
  var NexT = window.NexT || {};
  // Page-wide settings literal, declared as a global `var`.
  // NOTE(review): presumably consumed by the theme scripts loaded later in
  // the page; those files are not visible here, so confirm before renaming
  // or removing any key.
  var CONFIG = {
    root: '/',
    scheme: 'Gemini',
    version: '6.4.0',
    sidebar: {"position":"left","display":"post","offset":12,"b2t":false,"scrollpercent":false,"onmobile":false},
    fancybox: false,
    fastclick: false,
    lazyload: false,
    tabs: true,
    motion: {"enable":true,"async":false,"transition":{"post_block":"fadeIn","post_header":"slideDownIn","post_body":"slideDownIn","coll_header":"slideLeftIn","sidebar":"slideUpIn"}},
    // The three Algolia identifiers below are empty strings on this page.
    algolia: {
      applicationID: '',
      apiKey: '',
      indexName: '',
      hits: {"per_page":10},
      // Label templates containing ${query}, ${hits} and ${time} placeholders;
      // they are plain single/double-quoted strings, so nothing is substituted here.
      labels: {"input_placeholder":"Search for Posts","hits_empty":"We didn't find any results for the search: ${query}","hits_stats":"${hits} results found in ${time} ms"}
    }
  };
</script>


  




  <meta name="description" content="需求分析爬取图书（特别是小说）数据并存储，用作之后的分析。 可以爬取的站点有Amazon、京东、当当、腾讯旗下的一众小说网站（腾讯文学、创世中文、云起书院）、起点中文网等等。    爬虫选择根据自己熟悉的语言来调查分析网上流行的爬虫。 并最终根据自身情况选择了Scrapy。   以云起书院为例来分析相关网站的模式根据分析可以看出来，网站的层级有三层：图书列表，图书内容和图书评论。在爬取的时候可以根">
<meta name="keywords" content="Python,Scrapy,增量,爬虫">
<meta property="og:type" content="article">
<meta property="og:title" content="使用Scrapy写个增量式图书爬虫">
<meta property="og:url" content="http://blog.zhengzi.me/2016/07/29/e4-bd-bf-e7-94-a8scrapy-e5-86-99-e4-b8-aa-e5-a2-9e-e9-87-8f-e5-bc-8f-e5-9b-be-e4-b9-a6-e7-88-ac-e8-99-ab/index.html">
<meta property="og:site_name" content="政子的博客">
<meta property="og:description" content="需求分析爬取图书（特别是小说）数据并存储，用作之后的分析。 可以爬取的站点有Amazon、京东、当当、腾讯旗下的一众小说网站（腾讯文学、创世中文、云起书院）、起点中文网等等。    爬虫选择根据自己熟悉的语言来调查分析网上流行的爬虫。 并最终根据自身情况选择了Scrapy。   以云起书院为例来分析相关网站的模式根据分析可以看出来，网站的层级有三层：图书列表，图书内容和图书评论。在爬取的时候可以根">
<meta property="og:locale" content="zh_CN">
<meta property="og:image" content="http://blog.zhengzi.me/wp-content/uploads/2016/07/爬虫选择.png">
<meta property="og:image" content="http://blog.zhengzi.me/wp-content/uploads/2016/07/爬虫需求.png">
<meta property="og:image" content="http://blog.zhengzi.me/wp-content/uploads/2016/07/爬虫框架比较.png">
<meta property="og:image" content="http://blog.zhengzi.me/wp-content/uploads/2016/07/Scrapy.png">
<meta property="og:image" content="http://blog.zhengzi.me/wp-content/uploads/2016/07/图片1.png">
<meta property="og:image" content="http://blog.zhengzi.me/wp-content/uploads/2016/07/图片2-1024x877.png">
<meta property="og:image" content="http://blog.zhengzi.me/wp-content/uploads/2016/07/图片3.png">
<meta property="og:image" content="http://blog.zhengzi.me/wp-content/uploads/2016/07/图片4.png">
<meta property="og:updated_time" content="2018-08-21T09:49:42.773Z">
<meta name="twitter:card" content="summary">
<meta name="twitter:title" content="使用Scrapy写个增量式图书爬虫">
<meta name="twitter:description" content="需求分析爬取图书（特别是小说）数据并存储，用作之后的分析。 可以爬取的站点有Amazon、京东、当当、腾讯旗下的一众小说网站（腾讯文学、创世中文、云起书院）、起点中文网等等。    爬虫选择根据自己熟悉的语言来调查分析网上流行的爬虫。 并最终根据自身情况选择了Scrapy。   以云起书院为例来分析相关网站的模式根据分析可以看出来，网站的层级有三层：图书列表，图书内容和图书评论。在爬取的时候可以根">
<meta name="twitter:image" content="http://blog.zhengzi.me/wp-content/uploads/2016/07/爬虫选择.png">






  <link rel="canonical" href="http://blog.zhengzi.me/2016/07/29/e4-bd-bf-e7-94-a8scrapy-e5-86-99-e4-b8-aa-e5-a2-9e-e9-87-8f-e5-bc-8f-e5-9b-be-e4-b9-a6-e7-88-ac-e8-99-ab/"/>



<script type="text/javascript" id="page.configurations">
  // Attach per-page settings to the global CONFIG object. CONFIG must already
  // exist when this statement runs, otherwise the assignment throws.
  CONFIG.page = {
    sidebar: "",  // empty string on this page
  };
</script>

  <title>使用Scrapy写个增量式图书爬虫 | 政子的博客</title>
  









  <noscript>
  <style type="text/css">
    /*
     * No-JavaScript fallback. These rules only apply when scripting is
     * disabled (they live inside <noscript>) and reset opacity / offsets on
     * elements under the `use-motion` root class back to their initial values.
     * NOTE(review): presumably the main stylesheet hides or offsets these
     * elements until the motion scripts animate them in; confirm in main.css.
     */
    .use-motion .motion-element,
    .use-motion .brand,
    .use-motion .menu-item,
    .sidebar-inner,
    .use-motion .post-block,
    .use-motion .pagination,
    .use-motion .comments,
    .use-motion .post-header,
    .use-motion .post-body,
    .use-motion .collection-title { opacity: initial; }

    .use-motion .logo,
    .use-motion .site-title,
    .use-motion .site-subtitle {
      opacity: initial;
      top: initial;
    }

    /*
     * Written as flat descendant selectors rather than a nested
     * `.use-motion { ... }` block: browsers without CSS Nesting support
     * discard nested rules, which would leave the logo lines un-reset.
     */
    .use-motion .logo-line-before i { left: initial; }
    .use-motion .logo-line-after i { right: initial; }
  </style>
</noscript>

</head>

<body itemscope itemtype="http://schema.org/WebPage" lang="zh-CN">

  
  
    
  

  <div class="container sidebar-position-left page-post-detail">
    <div class="headband"></div>

    <header id="header" class="header" itemscope itemtype="http://schema.org/WPHeader">
      <div class="header-inner"><div class="site-brand-wrapper">
  <div class="site-meta ">
    

    <div class="custom-logo-site-title">
      <a href="/" class="brand" rel="start">
        <span class="logo-line-before"><i></i></span>
        <span class="site-title">政子的博客</span>
        <span class="logo-line-after"><i></i></span>
      </a>
    </div>
    
      
        <p class="site-subtitle">强迫症|学习者|伪文青</p>
      
    
  </div>

  <div class="site-nav-toggle">
    <button aria-label="Toggle navigation bar">
      <span class="btn-bar"></span>
      <span class="btn-bar"></span>
      <span class="btn-bar"></span>
    </button>
  </div>
</div>



<nav class="site-nav">
  
    <ul id="menu" class="menu">
      
        
        
        
          
          <li class="menu-item menu-item-home">
    <a href="/" rel="section">
      <i class="menu-item-icon fa fa-fw fa-home"></i> <br />Home</a>
  </li>
        
        
        
          
          <li class="menu-item menu-item-about">
    <a href="/about/" rel="section">
      <i class="menu-item-icon fa fa-fw fa-user"></i> <br />About</a>
  </li>
        
        
        
          
          <li class="menu-item menu-item-tags">
    <a href="/tags/" rel="section">
      <i class="menu-item-icon fa fa-fw fa-tags"></i> <br />Tags</a>
  </li>
        
        
        
          
          <li class="menu-item menu-item-archives">
    <a href="/archives/" rel="section">
      <i class="menu-item-icon fa fa-fw fa-archive"></i> <br />Archives</a>
  </li>

      
      
    </ul>
  

  

  
</nav>



  



</div>
    </header>

    


    <main id="main" class="main">
      <div class="main-inner">
        <div class="content-wrap">
          
            

          
          <div id="content" class="content">
            

  <div id="posts" class="posts-expand">
    

  

  
  
  

  

  <article class="post post-type-normal" itemscope itemtype="http://schema.org/Article">
  
  
  
  <div class="post-block">
    <link itemprop="mainEntityOfPage" href="http://blog.zhengzi.me/2016/07/29/e4-bd-bf-e7-94-a8scrapy-e5-86-99-e4-b8-aa-e5-a2-9e-e9-87-8f-e5-bc-8f-e5-9b-be-e4-b9-a6-e7-88-ac-e8-99-ab/">

    <span hidden itemprop="author" itemscope itemtype="http://schema.org/Person">
      <meta itemprop="name" content="Jason Bian">
      <meta itemprop="description" content="">
      <meta itemprop="image" content="/images/avatar.jpg">
    </span>

    <span hidden itemprop="publisher" itemscope itemtype="http://schema.org/Organization">
      <meta itemprop="name" content="政子的博客">
    </span>

    
      <header class="post-header">

        
        
          <h1 class="post-title" itemprop="name headline">使用Scrapy写个增量式图书爬虫
              
            
          </h1>
        

        <div class="post-meta">
          <span class="post-time">

            
            
            

            
              <span class="post-meta-item-icon">
                <i class="fa fa-calendar-o"></i>
              </span>
              
                <span class="post-meta-item-text">Posted on</span>
              

              
                
              

              <time title="Created: 2016-07-29 21:40:32" itemprop="dateCreated datePublished" datetime="2016-07-29T21:40:32+08:00">2016-07-29</time>
            

            
              

              
                
                <span class="post-meta-divider">|</span>
                

                <span class="post-meta-item-icon">
                  <i class="fa fa-calendar-check-o"></i>
                </span>
                
                  <span class="post-meta-item-text">Edited on</span>
                
                <time title="Modified: 2018-08-21 17:49:42" itemprop="dateModified" datetime="2018-08-21T17:49:42+08:00">2018-08-21</time>
              
            
          </span>

          
            <span class="post-category" >
            
              <span class="post-meta-divider">|</span>
            
              <span class="post-meta-item-icon">
                <i class="fa fa-folder-o"></i>
              </span>
              
                <span class="post-meta-item-text">In</span>
              
              
                <span itemprop="about" itemscope itemtype="http://schema.org/Thing"><a href="/categories/Python/" itemprop="url" rel="index"><span itemprop="name">Python</span></a></span>

                
                
                  , 
                
              
                <span itemprop="about" itemscope itemtype="http://schema.org/Thing"><a href="/categories/Python/爬虫/" itemprop="url" rel="index"><span itemprop="name">爬虫</span></a></span>

                
                
              
            </span>
          

          
            
          

          
          

          

          

          

        </div>
      </header>
    

    
    
    
    <div class="post-body" itemprop="articleBody">

      
      

      
        <h2 id="需求分析"><a href="#需求分析" class="headerlink" title="需求分析"></a>需求分析</h2><p>爬取图书（特别是小说）数据并存储，用作之后的分析。 可以爬取的站点有Amazon、京东、当当、腾讯旗下的一众小说网站（腾讯文学、创世中文、云起书院）、起点中文网等等。 <a href="http://blog.zhengzi.me/wp-content/uploads/2016/07/爬虫选择.png"><img src="http://blog.zhengzi.me/wp-content/uploads/2016/07/爬虫选择.png" alt="爬虫选择"></a> <a href="http://blog.zhengzi.me/wp-content/uploads/2016/07/爬虫需求.png"><img src="http://blog.zhengzi.me/wp-content/uploads/2016/07/爬虫需求.png" alt="爬虫需求"></a> </p>
<h2 id="爬虫选择"><a href="#爬虫选择" class="headerlink" title="爬虫选择"></a>爬虫选择</h2><p>根据自己熟悉的语言来调查分析网上流行的爬虫。 并最终根据自身情况选择了Scrapy。 <a href="http://blog.zhengzi.me/wp-content/uploads/2016/07/爬虫框架比较.png"><img src="http://blog.zhengzi.me/wp-content/uploads/2016/07/爬虫框架比较.png" alt="爬虫框架比较"></a> <a href="http://blog.zhengzi.me/wp-content/uploads/2016/07/Scrapy.png"><img src="http://blog.zhengzi.me/wp-content/uploads/2016/07/Scrapy.png" alt="Scrapy"></a></p>
<h2 id="以云起书院为例来分析相关网站的模式"><a href="#以云起书院为例来分析相关网站的模式" class="headerlink" title="以云起书院为例来分析相关网站的模式"></a>以云起书院为例来分析相关网站的模式</h2><p><a href="http://blog.zhengzi.me/wp-content/uploads/2016/07/图片1.png"><img src="http://blog.zhengzi.me/wp-content/uploads/2016/07/图片1.png" alt="图片1"></a>根据分析可以看出来，网站的层级有三层：图书列表，图书内容和图书评论。在爬取的时候可以根据三层的内容分别设计Item和数据库并进行爬取和存储。</p>
<h2 id="Scrapy工作方式"><a href="#Scrapy工作方式" class="headerlink" title="Scrapy工作方式"></a>Scrapy工作方式</h2><p><a href="http://blog.zhengzi.me/wp-content/uploads/2016/07/图片2.png"><img src="http://blog.zhengzi.me/wp-content/uploads/2016/07/图片2-1024x877.png" alt="图片2"></a> 有许多地方Scrapy框架都自动完成了，需要做的就是定义要爬取的URL以及将URL里面的内容使用Selector匹配到，在Pipeline中做数据清洗并且将Item里面的内容存入数据库或者文件。</p>
<h2 id="分层增量爬取架构"><a href="#分层增量爬取架构" class="headerlink" title="分层增量爬取架构"></a>分层增量爬取架构</h2><p><a href="http://blog.zhengzi.me/wp-content/uploads/2016/07/图片3.png"><img src="http://blog.zhengzi.me/wp-content/uploads/2016/07/图片3.png" alt="图片3"></a>增量爬取的方法就是分别对每一层写一个Spider，分别进行爬取，爬取之后进行存储并加入标识，之后更新的时候只更新新加入的图书而不去重新将所有的图书重新爬取。</p>
<h2 id="利用Shell指令实现自动增量爬取"><a href="#利用Shell指令实现自动增量爬取" class="headerlink" title="利用Shell指令实现自动增量爬取"></a>利用Shell指令实现自动增量爬取</h2><h3 id="实现爬虫每天自动运行"><a href="#实现爬虫每天自动运行" class="headerlink" title="实现爬虫每天自动运行"></a>实现爬虫每天自动运行</h3><p>//Bash</p>
<p>#crontab -l<br>5 0,12 * * * /root/cronList.sh</p>
<p>//cronList.sh</p>
<p>#!/bin/bash<br>cd /root/spider/yunqi<br>bash /root/spider/yunqi/run.sh</p>
<h3 id="每次开启任务，按照广度优先进行分层爬取"><a href="#每次开启任务，按照广度优先进行分层爬取" class="headerlink" title="每次开启任务，按照广度优先进行分层爬取"></a>每次开启任务，按照广度优先进行分层爬取</h3><p>//run.sh<br>DATE=$(date +%Y-%m-%d.%H:%M:%S)<br>echo  yunqi Spider Strat $DATE &gt;&gt; log.out<br>scrapy crawl novelList<br>echo yunqi novelList complete &gt;&gt;log.out<br>scrapy crawl novelContent<br>echo yunqi novelContent complete &gt;&gt;log.out<br>scrapy crawl novelComment<br>echo yunqi novelComment complete &gt;&gt;log.out<br>DATE=$(date +%Y-%m-%d.%H:%M:%S)<br>echo  yunqi Spider Finish $DATE &gt;&gt; log.out</p>
<h2 id="运行效果"><a href="#运行效果" class="headerlink" title="运行效果"></a>运行效果</h2><p><a href="http://blog.zhengzi.me/wp-content/uploads/2016/07/图片4.png"><img src="http://blog.zhengzi.me/wp-content/uploads/2016/07/图片4.png" alt="图片4"></a></p>
<h2 id="后期工作"><a href="#后期工作" class="headerlink" title="后期工作"></a>后期工作</h2><p>在爬取之后还应该使用Supervisor或者Scrapyd进行托管，以更稳定的方式运行。之后有空会继续更新一套可视化爬虫工具并开源。</p>
<h2 id="参考"><a href="#参考" class="headerlink" title="参考"></a>参考</h2><ul>
<li>Scrapy文档：<a href="http://doc.scrapy.org/en/1.1/index.html" target="_blank" rel="noopener">http://doc.scrapy.org/en/1.1/index.html</a></li>
<li>Scrapyd文档：<a href="https://scrapyd.readthedocs.io/en/latest/index.html" target="_blank" rel="noopener">https://scrapyd.readthedocs.io/en/latest/index.html</a></li>
<li>Scrapy使用笔记：<a href="http://www.q2zy.com/articles/2015/12/15/note-of-scrapy/" target="_blank" rel="noopener">http://www.q2zy.com/articles/2015/12/15/note-of-scrapy/</a></li>
<li>Scrapy使用以及Xpath的一些坑：<a href="http://www.tuicool.com/articles/EVfqI3" target="_blank" rel="noopener">http://www.tuicool.com/articles/EVfqI3</a></li>
<li>amazon图书爬虫（非增量式，已开源）：<a href="https://github.com/imaginezz/amazonBookSpider" target="_blank" rel="noopener">https://github.com/imaginezz/amazonBookSpider</a></li>
</ul>

      
    </div>

    

    
    
    

    

    

    

    <footer class="post-footer">
      
        <div class="post-tags">
          
            <a href="/tags/Python/" rel="tag"># Python</a>
          
            <a href="/tags/Scrapy/" rel="tag"># Scrapy</a>
          
            <a href="/tags/增量/" rel="tag"># 增量</a>
          
            <a href="/tags/爬虫/" rel="tag"># 爬虫</a>
          
        </div>
      

      
      
      

      
        <div class="post-nav">
          <div class="post-nav-next post-nav-item">
            
              <a href="/2016/07/18/e4-bd-bf-e7-94-a8azure-e5-bf-ab-e9-80-9f-e6-90-ad-e5-bb-bass-e6-9c-8d-e5-8a-a1-e5-99-a8/" rel="next" title="使用Azure快速搭建SS服务器">
                <i class="fa fa-chevron-left"></i> 使用Azure快速搭建SS服务器
              </a>
            
          </div>

          <span class="post-nav-divider"></span>

          <div class="post-nav-prev post-nav-item">
            
              <a href="/2016/08/28/hololens-e5-bc-80-e5-8f-91-e5-88-9d-e4-bd-93-e9-aa-8c/" rel="prev" title="HoloLens开发初体验">
                HoloLens开发初体验 <i class="fa fa-chevron-right"></i>
              </a>
            
          </div>
        </div>
      

      
      
    </footer>
  </div>
  
  
  
  </article>


  </div>


          </div>
          

  



        </div>
        
          
  
  <div class="sidebar-toggle">
    <div class="sidebar-toggle-line-wrap">
      <span class="sidebar-toggle-line sidebar-toggle-line-first"></span>
      <span class="sidebar-toggle-line sidebar-toggle-line-middle"></span>
      <span class="sidebar-toggle-line sidebar-toggle-line-last"></span>
    </div>
  </div>

  <aside id="sidebar" class="sidebar">
    
    <div class="sidebar-inner">

      

      
        <ul class="sidebar-nav motion-element">
          <li class="sidebar-nav-toc sidebar-nav-active" data-target="post-toc-wrap">
            Table of Contents
          </li>
          <li class="sidebar-nav-overview" data-target="site-overview-wrap">
            Overview
          </li>
        </ul>
      

      <section class="site-overview-wrap sidebar-panel">
        <div class="site-overview">
          <div class="site-author motion-element" itemprop="author" itemscope itemtype="http://schema.org/Person">
            
              <img class="site-author-image" itemprop="image"
                src="/images/avatar.jpg"
                alt="Jason Bian" />
            
              <p class="site-author-name" itemprop="name">Jason Bian</p>
              <p class="site-description motion-element" itemprop="description"></p>
          </div>

          
            <nav class="site-state motion-element">
              
                <div class="site-state-item site-state-posts">
                
                  <a href="/archives/">
                
                    <span class="site-state-item-count">86</span>
                    <span class="site-state-item-name">posts</span>
                  </a>
                </div>
              

              
                
                
                <div class="site-state-item site-state-categories">
                  
                    
                    
                      
                    
                      
                    
                      
                    
                      
                    
                      
                    
                      
                    
                      
                    
                      
                    
                      
                    
                      
                    
                      
                    
                      
                    
                      
                    
                      
                    
                      
                    
                      
                    
                      
                    
                      
                    
                      
                    
                      
                    
                      
                    
                      
                    
                      
                    
                      
                    
                      
                    
                      
                    
                      
                    
                      
                    
                      
                    
                      
                    
                      
                    
                      
                    
                      
                    
                      
                    
                      
                    
                      
                    
                      
                    
                      
                    
                      
                    
                      
                    
                      
                    
                      
                    
                    <span class="site-state-item-count">41</span>
                    <span class="site-state-item-name">categories</span>
                  
                </div>
              

              
                
                
                <div class="site-state-item site-state-tags">
                  
                    
                    
                      
                    
                      
                    
                      
                    
                      
                    
                      
                    
                      
                    
                      
                    
                      
                    
                      
                    
                      
                    
                      
                    
                      
                    
                      
                    
                      
                    
                      
                    
                      
                    
                      
                    
                      
                    
                      
                    
                      
                    
                      
                    
                      
                    
                      
                    
                      
                    
                      
                    
                      
                    
                      
                    
                      
                    
                      
                    
                      
                    
                      
                    
                      
                    
                      
                    
                      
                    
                      
                    
                      
                    
                      
                    
                      
                    
                      
                    
                      
                    
                      
                    
                      
                    
                      
                    
                      
                    
                      
                    
                      
                    
                      
                    
                      
                    
                      
                    
                      
                    
                      
                    
                      
                    
                      
                    
                      
                    
                      
                    
                      
                    
                      
                    
                      
                    
                      
                    
                      
                    
                      
                    
                      
                    
                      
                    
                      
                    
                      
                    
                      
                    
                      
                    
                      
                    
                      
                    
                      
                    
                      
                    
                      
                    
                      
                    
                      
                    
                      
                    
                      
                    
                      
                    
                      
                    
                      
                    
                      
                    
                      
                    
                      
                    
                      
                    
                      
                    
                      
                    
                      
                    
                      
                    
                      
                    
                      
                    
                      
                    
                      
                    
                      
                    
                      
                    
                      
                    
                      
                    
                      
                    
                      
                    
                      
                    
                      
                    
                      
                    
                      
                    
                      
                    
                      
                    
                      
                    
                      
                    
                      
                    
                      
                    
                      
                    
                      
                    
                    <span class="site-state-item-count">109</span>
                    <span class="site-state-item-name">tags</span>
                  
                </div>
              
            </nav>
          

          

          

          
          

          
          

          
            
          
          

        </div>
      </section>

      
      <!--noindex-->
        <section class="post-toc-wrap motion-element sidebar-panel sidebar-panel-active">
          <div class="post-toc">

            
              
            

            
              <div class="post-toc-content"><ol class="nav"><li class="nav-item nav-level-2"><a class="nav-link" href="#需求分析"><span class="nav-number">1.</span> <span class="nav-text">需求分析</span></a></li><li class="nav-item nav-level-2"><a class="nav-link" href="#爬虫选择"><span class="nav-number">2.</span> <span class="nav-text">爬虫选择</span></a></li><li class="nav-item nav-level-2"><a class="nav-link" href="#以云起书院为例来分析相关网站的模式"><span class="nav-number">3.</span> <span class="nav-text">以云起书院为例来分析相关网站的模式</span></a></li><li class="nav-item nav-level-2"><a class="nav-link" href="#Scrapy工作方式"><span class="nav-number">4.</span> <span class="nav-text">Scrapy工作方式</span></a></li><li class="nav-item nav-level-2"><a class="nav-link" href="#分层增量爬取架构"><span class="nav-number">5.</span> <span class="nav-text">分层增量爬取架构</span></a></li><li class="nav-item nav-level-2"><a class="nav-link" href="#利用Shell指令实现自动增量爬取"><span class="nav-number">6.</span> <span class="nav-text">利用Shell指令实现自动增量爬取</span></a><ol class="nav-child"><li class="nav-item nav-level-3"><a class="nav-link" href="#实现爬虫每天自动运行"><span class="nav-number">6.1.</span> <span class="nav-text">实现爬虫每天自动运行</span></a></li><li class="nav-item nav-level-3"><a class="nav-link" href="#每次开启任务，按照广度优先进行分层爬取"><span class="nav-number">6.2.</span> <span class="nav-text">每次开启任务，按照广度优先进行分层爬取</span></a></li></ol></li><li class="nav-item nav-level-2"><a class="nav-link" href="#运行效果"><span class="nav-number">7.</span> <span class="nav-text">运行效果</span></a></li><li class="nav-item nav-level-2"><a class="nav-link" href="#后期工作"><span class="nav-number">8.</span> <span class="nav-text">后期工作</span></a></li><li class="nav-item nav-level-2"><a class="nav-link" href="#参考"><span class="nav-number">9.</span> <span class="nav-text">参考</span></a></li></ol></div>
            

          </div>
        </section>
      <!--/noindex-->
      

      

    </div>
  </aside>


        
      </div>
    </main>

    <footer id="footer" class="footer">
      <div class="footer-inner">
        <div class="copyright">&copy; <span itemprop="copyrightYear">2018</span>
  <span class="with-love" id="animate">
    <i class="fa fa-user"></i>
  </span>
  <span class="author" itemprop="copyrightHolder">Jason Bian</span>

  

  
</div>




  <div class="powered-by">Powered by <a class="theme-link" target="_blank" rel="noopener" href="https://hexo.io">Hexo</a></div>



  <span class="post-meta-divider">|</span>



  <div class="theme-info">Theme – <a class="theme-link" target="_blank" rel="noopener" href="https://theme-next.org">NexT.Gemini</a></div>




        








        
      </div>
    </footer>

    
      <div class="back-to-top">
        <i class="fa fa-arrow-up"></i>
        
      </div>
    

    
	
    

    
  </div>

  

<script type="text/javascript">
  // If window.Promise does not report the tag '[object Function]' (for
  // example it is undefined, or has been replaced by something that is not a
  // function), overwrite it with null.
  // NOTE(review): presumably this lets scripts loaded afterwards treat
  // Promise as unavailable instead of calling a broken value; confirm
  // against the libraries that follow.
  if (Object.prototype.toString.call(window.Promise) !== '[object Function]') {
    window.Promise = null;
  }
</script>


























  
  
    <script type="text/javascript" src="/lib/jquery/index.js?v=2.1.3"></script>
  

  
  
    <script type="text/javascript" src="/lib/velocity/velocity.min.js?v=1.2.1"></script>
  

  
  
    <script type="text/javascript" src="/lib/velocity/velocity.ui.min.js?v=1.2.1"></script>
  


  


  <script type="text/javascript" src="/js/src/utils.js?v=6.4.0"></script>

  <script type="text/javascript" src="/js/src/motion.js?v=6.4.0"></script>



  
  


  <script type="text/javascript" src="/js/src/affix.js?v=6.4.0"></script>

  <script type="text/javascript" src="/js/src/schemes/pisces.js?v=6.4.0"></script>



  
  <script type="text/javascript" src="/js/src/scrollspy.js?v=6.4.0"></script>
<script type="text/javascript" src="/js/src/post-details.js?v=6.4.0"></script>



  


  <script type="text/javascript" src="/js/src/bootstrap.js?v=6.4.0"></script>



  



  










  





  

  

  

  

  
  

  

  

  

  

  

</body>
</html>
